Part I

load data

# Load the motorbike listings; as.is=TRUE keeps character columns as
# plain strings instead of converting them to factors on read.
df <- read.csv("BIKE DETAILS.csv", as.is=TRUE)
head(df)
##                                  name selling_price year seller_type     owner
## 1           Royal Enfield Classic 350        175000 2019  Individual 1st owner
## 2                           Honda Dio         45000 2017  Individual 1st owner
## 3 Royal Enfield Classic Gunmetal Grey        150000 2018  Individual 1st owner
## 4   Yamaha Fazer FI V 2.0 [2016-2018]         65000 2015  Individual 1st owner
## 5               Yamaha SZ [2013-2014]         20000 2011  Individual 2nd owner
## 6                    Honda CB Twister         18000 2010  Individual 1st owner
##   km_driven ex_showroom_price
## 1       350                NA
## 2      5650                NA
## 3     12000            148114
## 4     23000             89643
## 5     21000                NA
## 6     60000             53857

take a look at the dataset and clean the data

summary(df)
##      name           selling_price         year      seller_type       
##  Length:1061        Min.   :  5000   Min.   :1988   Length:1061       
##  Class :character   1st Qu.: 28000   1st Qu.:2011   Class :character  
##  Mode  :character   Median : 45000   Median :2015   Mode  :character  
##                     Mean   : 59638   Mean   :2014                     
##                     3rd Qu.: 70000   3rd Qu.:2017                     
##                     Max.   :760000   Max.   :2020                     
##                                                                       
##     owner             km_driven      ex_showroom_price
##  Length:1061        Min.   :   350   Min.   :  30490  
##  Class :character   1st Qu.: 13500   1st Qu.:  54852  
##  Mode  :character   Median : 25000   Median :  72752  
##                     Mean   : 34360   Mean   :  87959  
##                     3rd Qu.: 43000   3rd Qu.:  87032  
##                     Max.   :880000   Max.   :1278000  
##                                      NA's   :435

Since there are 435 NA’s in ex_showroom_price, we will just remove this column. We will also remove name column.

# Drop ex_showroom_price (435 of 1061 values are NA) and the free-text
# name column, which is not used as a predictor.
df$name <- NULL
df$ex_showroom_price <- NULL
head(df)
##   selling_price year seller_type     owner km_driven
## 1        175000 2019  Individual 1st owner       350
## 2         45000 2017  Individual 1st owner      5650
## 3        150000 2018  Individual 1st owner     12000
## 4         65000 2015  Individual 1st owner     23000
## 5         20000 2011  Individual 2nd owner     21000
## 6         18000 2010  Individual 1st owner     60000

Visualize data

# Bar chart of listing counts by seller type.
library(ggplot2)
ggplot(df, aes(x = seller_type, fill = seller_type)) +
  geom_bar()

# Bar chart of listing counts by ownership history.
ggplot(df, aes(x = owner, fill = owner)) +
  geom_bar()

Explore the relationships between independent variables and dependent variable using plots

(1) km driven vs. selling price

# Scatter plot: mileage against selling price.
plot(selling_price ~ km_driven, data = df,
     xlab = "km driven", ylab = "selling price",
     main = "km.driven vs. selling price")

(2) year vs. selling price

# Scatter plot: model year against selling price.
plot(selling_price ~ year, data = df,
     xlab = "year", ylab = "selling price",
     main = "year vs. selling price")

(3) seller type vs. selling price

# Box plots of selling price within each seller type.
ggplot(df, aes(x = seller_type, y = selling_price, fill = seller_type)) +
  geom_boxplot() +
  labs(title = "seller_type vs. selling price")

(4) owner vs. selling price

# Box plots of selling price within each ownership category.
ggplot(df, aes(x = owner, y = selling_price, fill = owner)) +
  geom_boxplot() +
  labs(title = "owner vs. selling price")

Feature Encoding

# Encode the two categorical predictors as factors so lm() generates
# dummy variables for them automatically.
df$seller_type <- as.factor(df$seller_type)
df$owner <- as.factor(df$owner)

pairwise scatter plot for quantitative variables

pairs(df, pch=20, cex=0.5, lower.panel=NULL)

Now we divide the dataset into training and testing datasets, so that we can build the model by fitting the training data and then predict on the testing data.

# Reproducible 70/30 split of the rows into training and testing sets.
set.seed(123)
n_rows <- nrow(df)
train_index <- sample(seq_len(n_rows), floor(0.7 * n_rows))
test_index <- setdiff(seq_len(n_rows), train_index)
train <- df[train_index, ]
test <- df[test_index, ]

nrow(train)
## [1] 742
nrow(test)
## [1] 319
# Fit the full model on the TRAINING data only.  The original fitted on
# the complete df, which leaks the held-out test observations into the
# model; Part II refits on train, so this makes the two consistent.
model <- lm(selling_price ~ ., data = train)
# Standard lm diagnostic plots (residuals vs fitted, Q-Q, etc.).
plot(model)

Part II

Model selection

Best subset regression

Select the subset of predictors that do the best at meeting some well-defined objective criterion. To be specific, we need to find the best subset of predictors having the largest adjusted R2 value or the smallest MSE, Mallow’s Cp, SBIC, SBC, or AIC. The best subset selected is (year, owner, km_driven).

library(olsrr)
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
# Refit the full model on the training data, then run all-subsets
# selection (olsrr): every combination of the four predictors is
# ranked by adjusted R^2, Cp, AIC, SBC, etc.
model <- lm(selling_price~., data=train)
best_subset <- ols_step_best_subset(model)
plot(best_subset)

best_subset
##             Best Subsets Regression            
## -----------------------------------------------
## Model Index    Predictors
## -----------------------------------------------
##      1         year                             
##      2         year owner                       
##      3         year owner km_driven             
##      4         year seller_type owner km_driven 
## -----------------------------------------------
## 
##                                                                 Subsets Regression Summary                                                                 
## -----------------------------------------------------------------------------------------------------------------------------------------------------------
##                        Adj.        Pred                                                                                                                     
## Model    R-Square    R-Square    R-Square     C(p)         AIC           SBIC          SBC            MSEP              FPE              HSP          APC  
## -----------------------------------------------------------------------------------------------------------------------------------------------------------
##   1        0.1940      0.1929      0.1872    61.0863    17842.1728    15736.1720    17856.0008    1.198558e+12    1619660653.0136    2185800.7125    0.8103 
##   2        0.2478      0.2437        -Inf     9.7963    17796.9626    15687.2272    17824.6187    1.120143e+12    1519841008.1134    2051112.2534    0.7583 
##   3        0.2586      0.2536        -Inf     1.0535    17788.1967    15678.5673    17820.4622    1.105506e+12    1501997038.7967    2027060.2922    0.7494 
##   4        0.2587      0.2526        -Inf     3.0000    17790.1427    15680.5330    17827.0175    1.106928e+12    1505947180.4605    2032428.2782    0.7514 
## -----------------------------------------------------------------------------------------------------------------------------------------------------------
## AIC: Akaike Information Criteria 
##  SBIC: Sawa's Bayesian Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
##  MSEP: Estimated error of prediction, assuming multivariate normality 
##  FPE: Final Prediction Error 
##  HSP: Hocking's Sp 
##  APC: Amemiya Prediction Criteria

Stepwise AIC regression

Build regression model from a set of candidate predictor variables by entering/removing predictors based on AIC, in a stepwise manner until there is no variable left to enter any more.

Forward selection

The subset selected is (year, owner, km_driven).

# Forward stepwise selection driven by AIC.
step_aic_forward <- ols_step_forward_aic(model, details=FALSE)
step_aic_forward
## 
##                                 Selection Summary                                  
## ----------------------------------------------------------------------------------
## Variable        AIC            Sum Sq             RSS          R-Sq      Adj. R-Sq 
## ----------------------------------------------------------------------------------
## year         17842.173    287770361404.616    1.195327e+12    0.19403      0.19294 
## owner        17796.963    367485077758.314    1.115612e+12    0.24778      0.24370 
## km_driven    17788.197         3.83555e+11    1.099542e+12    0.25862      0.25358 
## ----------------------------------------------------------------------------------
plot(step_aic_forward)

Backward selection

The subset selected is (year, owner, km_driven).

# Backward stepwise elimination driven by AIC.
step_aic_backward <- ols_step_backward_aic(model, details = FALSE)
step_aic_backward
## 
## 
##                              Backward Elimination Summary                             
## ------------------------------------------------------------------------------------
## Variable          AIC           RSS              Sum Sq          R-Sq      Adj. R-Sq 
## ------------------------------------------------------------------------------------
## Full Model     17790.143    1.099462e+12    383634937685.570    0.25867      0.25262 
## seller_type    17788.197    1.099542e+12         3.83555e+11    0.25862      0.25358 
## ------------------------------------------------------------------------------------
plot(step_aic_backward)

Both direction selection

The subset selected is (year, owner, km_driven).

# Bidirectional (add/remove) stepwise selection driven by AIC.
step_aic_both <- ols_step_both_aic(model, details = FALSE)
step_aic_both
## 
## 
##                                         Stepwise Summary                                        
## ----------------------------------------------------------------------------------------------
## Variable      Method        AIC           RSS              Sum Sq          R-Sq      Adj. R-Sq 
## ----------------------------------------------------------------------------------------------
## year         addition    17842.173    1.195327e+12    287770361404.616    0.19403      0.19294 
## owner        addition    17796.963    1.115612e+12    367485077758.314    0.24778      0.24370 
## km_driven    addition    17788.197    1.099542e+12         3.83555e+11    0.25862      0.25358 
## ----------------------------------------------------------------------------------------------
plot(step_aic_both)

Stepwise P-value regression

Forward selection

The subset selected is (year, owner, km_driven).

# Forward stepwise selection driven by predictor p-values.
step_p_forward <- ols_step_forward_p(model, details=FALSE)
step_p_forward
## 
##                                Selection Summary                                 
## --------------------------------------------------------------------------------
##         Variable                   Adj.                                             
## Step     Entered     R-Square    R-Square     C(p)         AIC           RMSE       
## --------------------------------------------------------------------------------
##    1    year           0.1940      0.1929    61.0863    17842.1728    40190.8786    
##    2    owner          0.2478      0.2437     9.7963    17796.9626    38906.5656    
##    3    km_driven      0.2586      0.2536     1.0535    17788.1967    38651.5645    
## --------------------------------------------------------------------------------
plot(step_p_forward)

Backward selection

The subset selected is (year, owner, km_driven).

# Backward stepwise elimination driven by predictor p-values.
step_p_backward <- ols_step_backward_p(model, details = FALSE)
step_p_backward
## 
## 
##                                Elimination Summary                                
## ---------------------------------------------------------------------------------
##         Variable                     Adj.                                            
## Step      Removed      R-Square    R-Square     C(p)        AIC           RMSE       
## ---------------------------------------------------------------------------------
##    1    seller_type      0.2586      0.2536    1.0535    17788.1967    38651.5645    
## ---------------------------------------------------------------------------------
plot(step_p_backward)
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?

Both direction selection

The subset selected is (year, owner, km_driven).

# Bidirectional stepwise selection driven by predictor p-values.
step_p_both <- ols_step_both_p(model, details = FALSE)
step_p_both
## 
##                                  Stepwise Selection Summary                                  
## --------------------------------------------------------------------------------------------
##                       Added/                   Adj.                                             
## Step    Variable     Removed     R-Square    R-Square     C(p)         AIC           RMSE       
## --------------------------------------------------------------------------------------------
##    1      year       addition       0.194       0.193    61.0860    17842.1728    40190.8786    
##    2      owner      addition       0.248       0.244     9.7960    17796.9626    38906.5656    
##    3    km_driven    addition       0.259       0.254     1.0530    17788.1967    38651.5645    
## --------------------------------------------------------------------------------------------
plot(step_p_both)

Conclusion: The model selected includes year, owner, and km_driven as predictor variables. All methods applied above agree with this result.

Data transformation

Box-Cox transformation

Use Box-Cox transformation to determine the most appropriate transformation of the response for correcting skewness of the distributions of error terms, unequal error variances, and nonlinearity of the regression function.

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:olsrr':
## 
##     cement
# NOTE(review): attach() is discouraged (it can silently mask objects in
# the global environment); it is kept here because the residual plots in
# Part III refer to the attached columns by bare name.  Otherwise prefer
# boxcox(..., data = train).
attach(train)
# Profile log-likelihood of the Box-Cox parameter lambda for the
# selected model (year + owner + km_driven).
b <- boxcox(selling_price~year+owner+km_driven)

# Rank the candidate lambda values by log-likelihood (descending)
# and show the top five; column 1 is lambda, column 2 the log-likelihood.
lik <- b$y
bc <- cbind(b$x, b$y)
sorted <- bc[order(-lik),]
head(sorted, 5)
##            [,1]      [,2]
## [1,] -0.1818182 -1971.339
## [2,] -0.2222222 -1971.479
## [3,] -0.1414141 -1972.253
## [4,] -0.2626263 -1972.693
## [5,] -0.1010101 -1974.196

The lambda value for the maximum log likelihood for obtaining minimum SSE is -0.18.

# Apply the Box-Cox power transformation using the (rounded) optimal
# lambda = -0.18 found above, then refit the model on the transformed
# response and re-check the diagnostic plots.
train$selling_price <- (train$selling_price)^-0.18
model <- lm(selling_price~year+owner+km_driven, data=train)
plot(model)
## Warning: not plotting observations with leverage one:
##   737

Part III: Model Check

Check the summary of the model

We need to check if the t-value and F-value are statistically significant.

# Final model on the Box-Cox-transformed response; check the t- and
# F-statistics for significance.
model <- lm(selling_price~year+owner+km_driven, data=train)
summary(model)
## 
## Call:
## lm(formula = selling_price ~ year + owner + km_driven, data = train)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.081950 -0.006937  0.002079  0.008788  0.040967 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     5.505e+00  2.616e-01  21.042  < 2e-16 ***
## year           -2.661e-03  1.298e-04 -20.498  < 2e-16 ***
## owner2nd owner  5.233e-04  1.552e-03   0.337  0.73609    
## owner3rd owner -1.445e-02  4.452e-03  -3.245  0.00123 ** 
## owner4th owner -4.393e-02  1.380e-02  -3.184  0.00151 ** 
## km_driven       4.169e-08  1.103e-08   3.779  0.00017 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01378 on 736 degrees of freedom
## Multiple R-squared:  0.4352, Adjusted R-squared:  0.4314 
## F-statistic: 113.4 on 5 and 736 DF,  p-value: < 2.2e-16

Here, since all of the p-value are smaller than alpha=0.05 (except the owner2nd owner), all of the t-value are statistically significant (except the owner2nd owner).

anova(model)
## Analysis of Variance Table
## 
## Response: selling_price
##            Df   Sum Sq  Mean Sq  F value    Pr(>F)    
## year        1 0.100740 0.100740 530.2739 < 2.2e-16 ***
## owner       3 0.004295 0.001432   7.5353 5.702e-05 ***
## km_driven   1 0.002713 0.002713  14.2798 0.0001703 ***
## Residuals 736 0.139823 0.000190                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Here, since all of the p-value are smaller than alpha=0.05, all of the F-value are statistically significant.

Calculate the residuals

# Raw residuals of the fitted model (names are the original row labels).
e = resid(model)
head(e)
##          415          463          179          526          195          938 
##  0.005123157  0.003921742  0.012618321  0.004259661  0.003363994 -0.005632955

Here, the residuals in the first six rows are displayed.

Check residuals and assumptions considered in linear regression

We will check the following assumptions of the linear regression:

  • assumption - linearity
  • assumption - constancy of error variance
  • assumption - independence of error terms
  • assumption - normality of error terms
  • assumption - absence of outliers

1. residual plot

  • residual vs fitted value
# Residuals vs fitted values: visual check of linearity and constant
# error variance; dashed line marks residual = 0.
expected_price=model$fitted
plot(expected_price, e, main="residual vs expected price", xlab="expected price", ylab="residual")
abline(h=0, lty=2)

  • residual vs predictor variables
# Residuals (and |residuals|) against each predictor.  The bare names
# year/owner/km_driven resolve through the earlier attach(train).
plot(year, e, main="residual vs year", xlab="year", ylab="residual")
abline(h=0, lty=2)

plot(year, abs(e), main="absolute residual vs year", xlab="year", ylab="absolute residual")

# owner is a factor, so plot() draws per-level box plots here.
plot(owner, e, main="residual vs owner", xlab="owner", ylab="residual")
abline(h=0, lty=2)

plot(owner, abs(e), main="absolute residual vs owner", xlab="owner", ylab="absolute residual")

plot(km_driven, e, main="residual vs km_driven", xlab="km_driven", ylab="residual")
abline(h=0, lty=2)

plot(km_driven, abs(e), main="absolute residual vs km_driven", xlab="km_driven", ylab="absolute residual")

Based on the residual plot (residual vs expected price/ year/ owner/ km_driven), we can see that nearly all the points lie near the horizontal line when residual equal to 0. Thus, we can conclude that the assumption of linearity, the assumption of constancy of error variance, and the assumption of independence of error terms are not violated.

Based on the plot of absolute value of residual (residual vs year/ owner/ km_driven), we can see that the error variance is nearly constant. Thus, we can conclude that the assumption of constancy of error variance is not violated.

2. distribution plot of residual

  • boxplot of residual
boxplot(e, main="boxplot of residual")

  • histogram of residual
hist(e, main="histogram of residual")

Based on the boxplot and histogram of residual, we can see that the median of residuals lie near 0, but there exists several outliers.

3. normal probability plot

# Normal Q-Q plot of the standardized residuals; the 0/1 line is the
# reference line for a standard normal distribution.
qqnorm(rstandard(model))
abline(0,1)

Based on the normal probability plot, we can conclude that the assumption of normality of error terms is not violated, since the graph appears nearly to be a linear line.

4. correlation test for normality

# Correlation test for normality: compare the ordered residuals with
# their expected values under normality (Blom-type plotting positions).
res.order <- sort(e)
n <- nrow(train)
# Use the model's residual degrees of freedom: n minus the number of
# estimated coefficients (6 here).  The original divided by n-2, which
# is only correct for simple linear regression.  The correlation itself
# is scale-invariant, but res.exp is also plotted on an absolute scale
# below, where the scale factor matters.
mse <- sum(e^2)/(n - length(coef(model)))
res.exp <- sqrt(mse)*qnorm((1:n-0.375)/(n+0.25))
cor(res.exp, res.order)
## [1] 0.9805561

Based on the correlation test for normality, we can conclude that the assumption of normality of error terms is not violated, since the correlation is nearly 1, showing the strong linear relationship.

plot(res.exp, res.order, main="normal probability plot of the residual", xlab="expected residual", ylab="residual")
abline(0,1)

The normal probability plot of the residual (the relationship between ordered residual and expected residual) is displayed above.

5. Breusch-Pagan test (BP test)

residual variance model: log(sigma^2)=r(intercept)+r(year)year+r(owner)owner+r(km_driven)km_driven

Hypothesis: H0: r(year)=r(owner)=r(km_driven)=0; Ha: not all r equal to 0.

Decision rule: when the BP test statistic > \(\chi^2(1-\alpha, 5)\), reject H0 and conclude Ha; when the BP test statistic < \(\chi^2(1-\alpha, 5)\), fail to reject H0 and conclude H0.

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
bptest(model, studentize=FALSE)
## 
##  Breusch-Pagan test
## 
## data:  model
## BP = 233.01, df = 5, p-value < 2.2e-16
qchisq(0.95, df=5)
## [1] 11.0705
qchisq(0.99, df=5)
## [1] 15.08627

Conclusion: BP test statistics=233.01 > chi-squared(1-\(\alpha\),5) whenever \(\alpha\)=0.05 or 0.01, so reject H0 and conclude Ha.

Based on BP test, we can conclude that the assumption of constancy of error variance is violated.

6. F test for lack of fit

Hypothesis: \(H_0\):E(selling_price)=\(\beta\)(intercept)+\(\beta\)(year)year+\(\beta\)(owner)owner+\(\beta\)(km_driven)km_driven; \(H_a\):E(selling_price)!=\(\beta\)(intercept)+\(\beta\)(year)year+\(\beta\)(owner)owner+\(\beta\)(km_driven)km_driven

Decision rule: when the F statistic > \(F(1-\alpha, 255, 481)\) (the lack-of-fit statistic's degrees of freedom from the anova table), reject H0 and conclude Ha; when the F statistic < \(F(1-\alpha, 255, 481)\), fail to reject H0 and conclude H0.

sum(duplicated(train[,-c(1,3)]))
## [1] 189

Since replicated predictor combinations exist (189 duplicated rows), we can conduct the F test for lack of fit.

# Lack-of-fit F test: compare the linear model (reduced) against the
# saturated model that fits a separate mean for every observed predictor
# combination (full, via factor versions of each predictor).
reduced = lm(selling_price~year+owner+km_driven, data=train)
full = lm(selling_price~0+as.factor(year)+as.factor(owner)+as.factor(km_driven), data=train)
anova(reduced, full)
## Analysis of Variance Table
## 
## Model 1: selling_price ~ year + owner + km_driven
## Model 2: selling_price ~ 0 + as.factor(year) + as.factor(owner) + as.factor(km_driven)
##   Res.Df      RSS  Df Sum of Sq      F    Pr(>F)    
## 1    736 0.139823                                   
## 2    481 0.072463 255   0.06736 1.7534 7.528e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
qf(0.95, df1=736, df=481)
## [1] 1.147556
qf(0.99, df1=736, df=481)
## [1] 1.215212

Conclusion: the F statistic \(= 1.7534 > F(1-\alpha, 255, 481)\) whenever \(\alpha\) = 0.05 or 0.01, so we reject H0 and conclude Ha.

Based on F test for lack of fit, we can conclude that the relationship assumed in the model is not reasonable (i.e. there is lack of fit). Thus, we can conclude that the assumption of linearity is violated.

Part IV: Test Model. First we take a look at the testing data and the original model:

# Apply the same Box-Cox transformation (lambda = -0.18) to the test
# response so it is on the same scale as the training response, refit
# the model, and look at pairwise plots of the test data.
test$selling_price <- (test$selling_price)^-0.18
model <- lm(selling_price~year+owner+km_driven, data=train)
plot(test)

summary(model)
## 
## Call:
## lm(formula = selling_price ~ year + owner + km_driven, data = train)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.081950 -0.006937  0.002079  0.008788  0.040967 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     5.505e+00  2.616e-01  21.042  < 2e-16 ***
## year           -2.661e-03  1.298e-04 -20.498  < 2e-16 ***
## owner2nd owner  5.233e-04  1.552e-03   0.337  0.73609    
## owner3rd owner -1.445e-02  4.452e-03  -3.245  0.00123 ** 
## owner4th owner -4.393e-02  1.380e-02  -3.184  0.00151 ** 
## km_driven       4.169e-08  1.103e-08   3.779  0.00017 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01378 on 736 degrees of freedom
## Multiple R-squared:  0.4352, Adjusted R-squared:  0.4314 
## F-statistic: 113.4 on 5 and 736 DF,  p-value: < 2.2e-16
plot(model)
## Warning: not plotting observations with leverage one:
##   737

Add prediction data into our testing data set:

# Predicted (transformed) selling price for every test observation.
test$predicted_value <- predict(model, test)
head(test)
##    selling_price year seller_type     owner km_driven predicted_value
## 1     0.11382906 2019  Individual 1st owner       350       0.1319192
## 3     0.11703172 2018  Individual 1st owner     12000       0.1350660
## 7     0.13149931 2018  Individual 1st owner     17000       0.1352744
## 9     0.15635730 2010  Individual 1st owner     32000       0.1571890
## 12    0.15831117 2016  Individual 2nd owner     10000       0.1408282
## 14    0.09972115 2019  Individual 1st owner      1127       0.1319516
# Keep only the model inputs and the prediction column.
test.2 <- subset(test,select = -c(seller_type,selling_price))
head(test.2)
##    year     owner km_driven predicted_value
## 1  2019 1st owner       350       0.1319192
## 3  2018 1st owner     12000       0.1350660
## 7  2018 1st owner     17000       0.1352744
## 9  2010 1st owner     32000       0.1571890
## 12 2016 2nd owner     10000       0.1408282
## 14 2019 1st owner      1127       0.1319516

Comparing exact data and expected data:

# Side-by-side actual vs predicted values (both on the Box-Cox scale).
comparison <- data.frame(test$selling_price, test$predicted_value)
head(comparison)
##   test.selling_price test.predicted_value
## 1         0.11382906            0.1319192
## 2         0.11703172            0.1350660
## 3         0.13149931            0.1352744
## 4         0.15635730            0.1571890
## 5         0.15831117            0.1408282
## 6         0.09972115            0.1319516

Calculating Mean Square Prediction Error (MSPE):

# Mean Squared Prediction Error on the hold-out set: the average squared
# difference between predicted and observed responses.  (The original
# divided each squared error by the observed value -- a chi-square-like
# quantity, not the MSPE -- and hard-coded the test-set size as 319.)
prediction.error <- (comparison$test.predicted_value - comparison$test.selling_price)^2
MSPE <- sum(prediction.error) / nrow(comparison)
MSPE
## [1] 0.001426276
# 95% prediction intervals for each test observation.
predictions <- predict(model, newdata=test, interval="prediction")
head(predictions)
##          fit       lwr       upr
## 1  0.1319192 0.1048118 0.1590266
## 3  0.1350660 0.1079696 0.1621624
## 7  0.1352744 0.1081786 0.1623703
## 9  0.1571890 0.1300867 0.1842914
## 12 0.1408282 0.1136050 0.1680514
## 14 0.1319516 0.1048444 0.1590588

Graph: Comparing expected selling price and exact selling price:

# Actual vs predicted selling price; the identity line marks perfect
# prediction and the fitted line shows the observed trend.
library(ggplot2)
ggplot(comparison, aes(x = test.selling_price, y = test.predicted_value)) +
  geom_point() +
  geom_smooth(method = "lm") +
  geom_abline(intercept = 0, slope = 1)
## `geom_smooth()` using formula 'y ~ x'

Obtain the studentized deleted residuals and identify any outlying Y observations, using the Bonferroni outlier test procedure with \(\alpha = 0.1\).

# Bonferroni outlier test on the studentized deleted residuals.
# The residuals come from the TRAINING fit, so n and p must come from
# the fitted model, not from the test data frame (the original used
# dim(test), where ncol also counts the added predicted_value column).
# The original also compared only positive residuals; the test is
# two-sided, so take the absolute value.
n <- length(rstudent(model))
p <- length(coef(model))
critical_value <- qt(1 - 0.1/(2*n), n - p - 1)
which(abs(rstudent(model)) > critical_value)
## named integer(0)

Decision rule: if \(|t_i| \le t(1 - 0.1/(2n), n - p - 1)\) for every observation, we conclude there are no outliers. In this case, no residual exceeds the critical value, so we do not have outliers.

Obtain the diagonal elements of the hat matrix. Identify any outlying X observations:

# Leverage values (diagonal of the hat matrix); flag points whose
# leverage exceeds twice the mean leverage, 2p/n.
# NOTE(review): hatvalues() come from the training fit, while n and p
# were computed earlier -- confirm they refer to the same data.
hii = hatvalues(model)
which(hii > 2*p/n)
##  555  878  862 1057  429   40  458  797  364  203   29  668  798  576  478  940 
##   21  115  117  166  193  198  208  247  369  512  586  612  625  628  676  690 
##  869  312 
##  708  737

Testing Influences:

# NOTE(review): this regresses the *predicted* values on the predictors
# and computes influence measures from that auxiliary fit; influence
# diagnostics are usually computed on the model fitted to the observed
# response -- confirm this is intended.
model.test <- lm(predicted_value~year+owner+km_driven, data=test.2)
# Cook's distance, DFBETAS, and DFFITS for each test observation.
COOKS = cooks.distance(model.test)
DFBETAS = dfbetas(model.test)
DFFITS = as.numeric(dffits(model.test))
target = c(1:319)
Output <- cbind(DFFITS,DFBETAS,COOKS)[target, ]
head(Output)
##           DFFITS  (Intercept)          year owner2nd owner owner3rd owner
## 1  -67.848592748 43.405796852 -43.539701764   8.6921317704  -4.8930515239
## 3    0.008277841 -0.004938085   0.004954818  -0.0013788688   0.0005132043
## 7    0.007032188 -0.004337553   0.004351180  -0.0012023314   0.0004751577
## 9    0.002568742  0.001741474  -0.001737424  -0.0007110215  -0.0004349891
## 12   0.014210576 -0.002591479   0.002594759   0.0133816714   0.0004197871
## 14   0.014890169 -0.009582025   0.009611287  -0.0019187743   0.0010868761
##    owner4th owner     km_driven        COOKS
## 1   -1.345500e+01 16.9555908316 4.164025e-01
## 3    9.866722e-04 -0.0011481516 1.145663e-05
## 7    5.327688e-04 -0.0004350138 8.268143e-06
## 9   -2.321329e-04 -0.0002476203 1.103261e-06
## 12   1.541523e-03 -0.0019449063 3.376405e-05
## 14   2.870989e-03 -0.0035735154 3.706798e-05
2*sqrt(p/n)
## [1] 0.2742902
2/sqrt(n)
## [1] 0.1119785
plot(qf(COOKS[target],p, n-p))